In [1]:
import altair as alt
import pandas as pd
import os
from toolz.curried import pipe
from vega_datasets import data
from altair import datum
# --- Disabled option: store chart data as JSON files next to the notebook ---
# # Create a new data transformer that stores the files in a directory
# def json_dir(data, data_dir='altairdata'):
#     os.makedirs(data_dir, exist_ok=True)
#     return pipe(data, alt.to_json(filename=data_dir + '/{prefix}-{hash}.{extension}') )
# # Register and enable the new transformer
# alt.data_transformers.register('json_dir', json_dir)
# alt.data_transformers.enable('json_dir')
# Handle large data sets: lift Altair's default limit of 5000 rows per chart
# data set (above that limit Altair raises an error instead of rendering).
# See here: https://altair-viz.github.io/user_guide/data_transformers.html
alt.data_transformers.disable_max_rows()
# --- Disabled option: alternative renderers (kept for reference) ---
# alt.renderers.enable('jupyterlab')
# alt.renderers.enable("jupyter", offline=True) # to enable these to work in the cbtf
# alt.renderers.enable("mimetype")
Out[1]:
DataTransformerRegistry.enable('default')
In [2]:
# Read the cleaned thermometer data (path is relative to this notebook)
# and preview its first five rows.
df = pd.read_csv(
    "../../data/processed/cleaned_thermometer_data.csv",
    low_memory=True,
)
df.head(5)
Out[2]:
| Voting_Preference | State_Code_FIPS | Year_of_Study | Thermometer_Liberals | Thermometer_Conservatives | Thermometer_Gays_and_Lesbians | Thermometer_Feminists | |
|---|---|---|---|---|---|---|---|
| 0 | Democrat | 51.0 | 2020 | 85.0 | 30.0 | 85.0 | 85.0 |
| 1 | Democrat | 36.0 | 2020 | 80.0 | 50.0 | 97.0 | 65.0 |
| 2 | Major Third Party | 32.0 | 2020 | 70.0 | 50.0 | 70.0 | 70.0 |
| 3 | Republican | 18.0 | 2020 | 30.0 | 70.0 | 85.0 | 60.0 |
| 4 | Republican | 41.0 | 2020 | 50.0 | 50.0 | 45.0 | 45.0 |
In [3]:
# Remove the voting-preference column; the remaining columns are the ones
# averaged per state and year in the next cell.
df2 = df.drop(labels=['Voting_Preference'], axis='columns')
In [4]:
# Mean of every remaining column for each (state FIPS code, survey year) pair.
# as_index=False keeps the two grouping keys as ordinary columns.
df_avg = (
    df2
    .groupby(by=['State_Code_FIPS', 'Year_of_Study'], as_index=False)
    .mean()
)
Final plot¶
In [5]:
import altair as alt
import pandas as pd
import numpy as np
from vega_datasets import data
# The four thermometer rating columns used by every chart in this cell.
thermometer_columns = [
    "Thermometer_Liberals",
    "Thermometer_Conservatives",
    "Thermometer_Gays_and_Lesbians",
    "Thermometer_Feminists",
]

# Store the survey year as an integer so the flattened column names built
# below end in the bare year digits.
df_avg['Year_of_Study'] = df_avg['Year_of_Study'].astype(int)

# Long form: one row per (state, year, attribute) with the averaged rating.
df_melt = df_avg.melt(
    id_vars=['State_Code_FIPS', 'Year_of_Study'],
    value_vars=thermometer_columns,
    var_name='Attribute',
    value_name='Value',
)

# Wide form: one row per state, one column per (attribute, year) pair.
df_wide = df_melt.pivot(
    index='State_Code_FIPS',
    columns=['Attribute', 'Year_of_Study'],
    values='Value',
).reset_index()

# Flatten the two-level column labels to "<Attribute>_<Year>". The state key,
# whose second level is the empty string, keeps its plain name.
df_wide.columns = [
    attribute if year == '' else f"{attribute}_{year}"
    for attribute, year in df_wide.columns.to_flat_index()
]

# Every flattened column name, including the State_Code_FIPS key.
all_wide_cols = list(df_wide.columns)

# Distinct survey years in ascending order.
year_values = sorted(df_avg['Year_of_Study'].unique(), key=int)
# Year slider: a range input (2000-2020, step 4) bound to a point selection
# on Year_of_Study that is created with value=2020.
slider = alt.binding_range(
    min=2000, max=2020, step=4, name='Year: '
)
select_year = alt.selection_point(
    fields=['Year_of_Study'],
    bind=slider,
    value=2020,
)

# Rectangular brush over the x and y encodings.
brush = alt.selection_interval(encodings=['x', 'y'])

# Legend-bound selection on voting preference; toggle=False disables
# shift-click accumulation, so a click replaces the current pick.
select_point = alt.selection_point(fields=['Voting_Preference'], bind='legend', toggle=False)

# State selection keyed by FIPS code. empty=False makes an empty selection
# match nothing; it is the boolean form of the legacy 'none' string, which
# Altair 5 only accepts through a deprecation shim.
highlight_state = alt.selection_point(
    fields=['State_Code_FIPS'],
    value=1,
    empty=False
)

# Legend-bound selection on the thermometer attribute (single choice).
select_attribute = alt.selection_point(
    fields=['Attribute'],
    bind='legend',
    toggle=False
)

# Dropdown choosing which rating column feeds the scatter plot's x axis.
x_attribute_dropdown = alt.binding_select(
    options=thermometer_columns,
    name="Select X-Axis for scatter plot: "
)
select_x_attribute = alt.selection_point(
    fields=["X_Attribute"],
    bind=x_attribute_dropdown,
    value="Thermometer_Liberals"
)

# Dropdown choosing which rating column feeds the scatter plot's y axis.
y_attribute_dropdown = alt.binding_select(
    options=thermometer_columns,
    name="Select Y-Axis for scatter plot: "
)
select_y_attribute = alt.selection_point(
    fields=["Y_Attribute"],
    bind=y_attribute_dropdown,
    value="Thermometer_Gays_and_Lesbians"
)
# State outlines from the vega_datasets US TopoJSON file ('states' feature).
states = alt.topo_feature(data.us_10m.url, 'states')

# Choropleth of the average rating per state for the chosen year/attribute.
# NOTE(review): select_attribute is created without an initial value, so
# until a legend entry is picked its filter presumably keeps every attribute
# and each state is drawn once per attribute - confirm this is intended.
chart = (
    alt.Chart(states)
    .mark_geoshape()
    # Join the wide per-state table onto each shape by FIPS id. The key column
    # is fetched as well because highlight_state projects on State_Code_FIPS.
    .transform_lookup(
        lookup='id',
        from_=alt.LookupData(df_wide, 'State_Code_FIPS', all_wide_cols)
    )
    # Unpivot only the "<Attribute>_<Year>" rating columns. Folding the join
    # key too would add one bogus row per state whose parsed year is NaN and
    # whose Value is the FIPS code rather than a rating.
    .transform_fold(
        fold=[col for col in all_wide_cols if col != 'State_Code_FIPS'],
        as_=['AttributeYear', 'Value']
    )
    # Split "<Attribute>_<Year>": the last four characters are the year, and
    # everything before the separating underscore is the attribute name.
    .transform_calculate(
        Year_of_Study="parseInt(substring(datum.AttributeYear, length(datum.AttributeYear) - 4, length(datum.AttributeYear)))",
        Attribute="substring(datum.AttributeYear, 0, length(datum.AttributeYear) - 5)"
    )
    .transform_filter(select_year)
    .transform_filter(select_attribute)
    .encode(
        # The selected state gets a thick, opaque gray outline; all other
        # states get a thin, nearly transparent white one.
        stroke=alt.condition(
            highlight_state,
            alt.value('gray'),
            alt.value('white')
        ),
        strokeWidth=alt.condition(
            highlight_state,
            alt.value(5),
            alt.value(2)
        ),
        strokeOpacity=alt.condition(
            highlight_state,
            alt.value(1),
            alt.value(0.1)
        ),
        color=alt.Color(
            'Value:Q',
            title='Avg Score',
            scale=alt.Scale(scheme='blueorange', domain=[0, 100])
        ),
        tooltip=[
            alt.Tooltip('id:O', title='State FIPS'),
            alt.Tooltip('Value:Q', title='Avg Rating', format='.2f'),
            alt.Tooltip('Year_of_Study:N', title='Year'),
            alt.Tooltip('Attribute:N', title='Selected Attribute')
        ]
    )
    .project(type='albersUsa')
    .properties(
        width=800,
        height=800,
        title='Average Thermometer Ratings per US State'
    )
    .add_params(select_year, highlight_state, select_attribute)
)
# Scatter plot of one thermometer rating against another for the selected
# state and year; both axes are chosen through the dropdown selections.
regressio_plot = (
    alt.Chart(df)
    # Two successive folds give every (Y attribute, X attribute) pairing per
    # input row; the two dropdown filters then keep exactly one pairing.
    .transform_fold(thermometer_columns, as_=['Y_Attribute', 'Y_Value'])
    .transform_fold(thermometer_columns, as_=['X_Attribute', 'X_Value'])
    .transform_filter(select_x_attribute)
    .transform_filter(select_y_attribute)
    .transform_filter(select_year)
    .transform_filter(highlight_state)
    .mark_point()
    .encode(
        x=alt.X('X_Value:Q', title='Selected X-Axis Thermometer Rating'),
        y=alt.Y('Y_Value:Q', title='Selected Y-Axis Thermometer Rating'),
        color=alt.Color('Voting_Preference:N'),
        # Points outside the legend selection become fully transparent.
        opacity=alt.condition(select_point, alt.value(1), alt.value(0)),
        tooltip=[
            alt.Tooltip("State_Code_FIPS:N", title="State FIPS"),
            alt.Tooltip("X_Attribute:N", title="Selected X Attribute"),
            alt.Tooltip("Y_Attribute:N", title="Selected Y Attribute"),
            alt.Tooltip("X_Value:Q", title="X-Axis Rating", format=".2f"),
            alt.Tooltip("Y_Value:Q", title="Y-Axis Rating", format=".2f"),
        ],
    )
    .properties(
        width=250,
        height=250,
        title="Regression Plot of Thermometer Ratings (Dynamic Axes)",
    )
    .add_params(select_point, select_x_attribute, select_y_attribute, brush)
)
# Horizontal bar chart counting rows per voting preference, restricted to
# the selected state, year, axis attributes and the brush selection.
# NOTE(review): handing several predicates to one transform_filter call
# relies on an Altair release that supports it - confirm the pinned version.
bar_plot = (
    alt.Chart(df)
    # Same double fold as the scatter plot so X_Value / Y_Value exist here too.
    .transform_fold(thermometer_columns, as_=['Y_Attribute', 'Y_Value'])
    .transform_fold(thermometer_columns, as_=['X_Attribute', 'X_Value'])
    .transform_filter(select_x_attribute)
    # Y-attribute choice and brush are given to a single filter call.
    .transform_filter(select_y_attribute, brush)
    .transform_filter(select_year)
    .transform_filter(highlight_state)
    .mark_bar()
    .encode(
        x=alt.X('count()', title='Count'),
        y=alt.Y("Voting_Preference"),
        color=alt.Color('Voting_Preference:N'),
    )
    .properties(
        width=250,
        height=50,
    )
    # The same parameters as on the scatter plot (brush included) are
    # registered on this chart as well.
    .add_params(select_point, select_x_attribute, select_y_attribute, brush)
)
def compute_correlation(df):
    """Return the pairwise correlations of the four thermometer columns in long form.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four ``Thermometer_*`` rating columns; any other
        columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per ordered pair of attributes, with the columns
        ``Attribute_X``, ``Attribute_Y`` and ``Correlation`` (computed with
        ``DataFrame.corr()`` and its default settings).
    """
    rating_columns = [
        "Thermometer_Gays_and_Lesbians",
        "Thermometer_Feminists",
        "Thermometer_Liberals",
        "Thermometer_Conservatives",
    ]
    pairwise = df[rating_columns].corr()
    # reset_index() exposes the row labels as an "index" column, which melt
    # then keeps as the identifier of each unpivoted row.
    long_form = pairwise.reset_index().melt(id_vars="index")
    long_form.columns = ["Attribute_X", "Attribute_Y", "Correlation"]
    return long_form
# Correlations over the complete data set.
overall_corr = compute_correlation(df)

# Correlations computed separately for every (state, year, voting preference)
# group, flattened back into ordinary columns.
df_state_corr = (
    df.groupby(["State_Code_FIPS", "Year_of_Study", "Voting_Preference"])[
        ["Thermometer_Gays_and_Lesbians", "Thermometer_Feminists", "Thermometer_Liberals", "Thermometer_Conservatives"]
    ]
    .apply(compute_correlation)
    .reset_index()
    # "level_3" is the unnamed inner index level carried over from each
    # per-group result frame; it holds no information for the heatmap.
    .drop(columns=["level_3"])
)
# Correlation heatmap for the selected year, voting preference and state.
# NOTE(review): handing several predicates to one transform_filter call
# relies on an Altair release that supports it - confirm the pinned version.
heatmap = (
    alt.Chart(df_state_corr)
    .transform_filter(
        select_year, select_point, highlight_state
    )
    .mark_rect()
    .encode(
        x=alt.X("Attribute_X:N", title="Attribute"),
        y=alt.Y("Attribute_Y:N", title="Attribute"),
        # Fixed [-1, 1] domain keeps the colours comparable between selections.
        color=alt.Color(
            "Correlation:Q",
            title="Correlation",
            scale=alt.Scale(scheme="blueorange", domain=[-1, 1], reverse=False),
        ),
        tooltip=[
            alt.Tooltip("Attribute_X:N", title="Attribute 1"),
            alt.Tooltip("Attribute_Y:N", title="Attribute 2"),
            alt.Tooltip("Correlation:Q", title="Correlation", format=".2f"),
            alt.Tooltip("State_Code_FIPS:N", title="State FIPS"),
            alt.Tooltip("Year_of_Study:N", title="Year of Study"),
            alt.Tooltip("Voting_Preference:N", title="Voting Preference"),
        ],
    )
    .properties(
        width=250,
        height=250,
        title="Correlation Matrix of Thermometer Ratings",
    )
    .add_params(select_year, select_point)
)
# Violin-style density plots of the ratings, one facet column per attribute,
# for the selected state and year.
violin_plot = (
    alt.Chart(df)
    # Long form: one row per (input row, attribute) with its rating in Value.
    .transform_fold(thermometer_columns, as_=['Attribute', 'Value'])
    # Density estimate over 0-100, computed per attribute, year and state so
    # the two filters below can pick a single group afterwards.
    .transform_density(
        density='Value',
        as_=['Value', 'Density'],
        extent=[0, 100],
        groupby=["Attribute", "Year_of_Study", "State_Code_FIPS"],
    )
    .transform_filter(select_year)
    .transform_filter(highlight_state)
    .mark_area(orient='horizontal')
    .encode(
        y=alt.Y('Value:Q', title='Thermometer Ratings'),
        # stack='center' mirrors the area around the axis (the violin shape).
        x=alt.X(
            'Density:Q',
            stack='center',
            impute=None,
            title=None,
            axis=alt.Axis(labels=False, values=[0], grid=False, ticks=True),
        ),
        color=alt.Color('Attribute:N', title='Attribute'),
        column=alt.Column(
            'Attribute:N',
            header=alt.Header(
                titleOrient='bottom',
                labelOrient='bottom',
                labelPadding=0,
            ),
            title='Thermometer Attribute',
        ),
        # Attributes outside the legend selection are dimmed, not hidden.
        opacity=alt.condition(select_attribute, alt.value(1), alt.value(0.2)),
    )
    .properties(
        width=200,
        title="Distribution of Thermometer Ratings by Attribute",
    )
    .add_params(select_attribute)
)
# Blank text chart of height 70; it is not referenced in the layout below.
spacer = alt.Chart(pd.DataFrame({'text': ['']})).mark_text().properties(height=70)

# Layout: map on the left; on the right the violins sit above a row that
# holds the scatter plot stacked on the bar chart, next to the heatmap.
detail_row = (regressio_plot & bar_plot) | heatmap
side_panel = violin_plot & detail_row

# Colour and opacity scales and legends are resolved independently per view.
final_chart = (
    (chart | side_panel)
    .resolve_scale(color='independent', opacity='independent')
    .resolve_legend(color='independent', opacity='independent')
)
final_chart
Out[5]:
In [ ]:
In [ ]:
In [ ]: